#package
library(ggplot2)
#library part
library(ISLR2)
library(ltm)
## Loading required package: MASS
##
## Attaching package: 'MASS'
## The following object is masked from 'package:ISLR2':
##
## Boston
## Loading required package: msm
## Loading required package: polycor
library(tree)
library(rpart)
library(rpart.plot)
library(caret)
## Loading required package: lattice
library(gbm)
## Loaded gbm 2.1.9
## This version of gbm is no longer under development. Consider transitioning to gbm3, https://github.com/gbm-developers/gbm3
# EDA
library(dlookr)
## Registered S3 methods overwritten by 'dlookr':
## method from
## plot.transform scales
## print.transform scales
##
## Attaching package: 'dlookr'
## The following object is masked from 'package:base':
##
## transform
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:MASS':
##
## select
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
#data part
data(OJ)
data("Caravan")
head(OJ)
## Purchase WeekofPurchase StoreID PriceCH PriceMM DiscCH DiscMM SpecialCH
## 1 CH 237 1 1.75 1.99 0.00 0.0 0
## 2 CH 239 1 1.75 1.99 0.00 0.3 0
## 3 CH 245 1 1.86 2.09 0.17 0.0 0
## 4 MM 227 1 1.69 1.69 0.00 0.0 0
## 5 CH 228 7 1.69 1.69 0.00 0.0 0
## 6 CH 230 7 1.69 1.99 0.00 0.0 0
## SpecialMM LoyalCH SalePriceMM SalePriceCH PriceDiff Store7 PctDiscMM
## 1 0 0.500000 1.99 1.75 0.24 No 0.000000
## 2 1 0.600000 1.69 1.75 -0.06 No 0.150754
## 3 0 0.680000 2.09 1.69 0.40 No 0.000000
## 4 0 0.400000 1.69 1.69 0.00 No 0.000000
## 5 0 0.956535 1.69 1.69 0.00 Yes 0.000000
## 6 1 0.965228 1.99 1.69 0.30 Yes 0.000000
## PctDiscCH ListPriceDiff STORE
## 1 0.000000 0.24 1
## 2 0.000000 0.24 1
## 3 0.091398 0.23 1
## 4 0.000000 0.00 1
## 5 0.000000 0.00 0
## 6 0.000000 0.30 0
str(OJ)
## 'data.frame': 1070 obs. of 18 variables:
## $ Purchase : Factor w/ 2 levels "CH","MM": 1 1 1 2 1 1 1 1 1 1 ...
## $ WeekofPurchase: num 237 239 245 227 228 230 232 234 235 238 ...
## $ StoreID : num 1 1 1 1 7 7 7 7 7 7 ...
## $ PriceCH : num 1.75 1.75 1.86 1.69 1.69 1.69 1.69 1.75 1.75 1.75 ...
## $ PriceMM : num 1.99 1.99 2.09 1.69 1.69 1.99 1.99 1.99 1.99 1.99 ...
## $ DiscCH : num 0 0 0.17 0 0 0 0 0 0 0 ...
## $ DiscMM : num 0 0.3 0 0 0 0 0.4 0.4 0.4 0.4 ...
## $ SpecialCH : num 0 0 0 0 0 0 1 1 0 0 ...
## $ SpecialMM : num 0 1 0 0 0 1 1 0 0 0 ...
## $ LoyalCH : num 0.5 0.6 0.68 0.4 0.957 ...
## $ SalePriceMM : num 1.99 1.69 2.09 1.69 1.69 1.99 1.59 1.59 1.59 1.59 ...
## $ SalePriceCH : num 1.75 1.75 1.69 1.69 1.69 1.69 1.69 1.75 1.75 1.75 ...
## $ PriceDiff : num 0.24 -0.06 0.4 0 0 0.3 -0.1 -0.16 -0.16 -0.16 ...
## $ Store7 : Factor w/ 2 levels "No","Yes": 1 1 1 1 2 2 2 2 2 2 ...
## $ PctDiscMM : num 0 0.151 0 0 0 ...
## $ PctDiscCH : num 0 0 0.0914 0 0 ...
## $ ListPriceDiff : num 0.24 0.24 0.23 0 0 0.3 0.3 0.24 0.24 0.24 ...
## $ STORE : num 1 1 1 1 0 0 0 0 0 0 ...
data description) 오렌지 주스 브랜드인 Citrus Hill(CH)과 Minute Maid(MM)의 구매 기록에 대한 데이터셋이다.
Attribute)
observations : 1070
variables : 18
1. Purchase : 소비자가 구매한 오렌지 주스의 브랜드 (CH 또는 MM)*
2. WeekofPurchase : 구입이 이뤄진 주차
3. StoreID : 상점 ID
4. PriceCH : 브랜드 CH의 가격*
5. PriceMM : 브랜드 MM의 가격*
6. DiscCH : 브랜드 CH의 할인 가격
7. DiscMM : 브랜드 MM의 할인 가격
8. SpecialCH : 브랜드 CH의 특별 행사 여부
9. SpecialMM : 브랜드 MM의 특별 행사 여부
10. LoyalCH : 브랜드 CH에 대한 고객 브랜드 충성도*
11. SalePriceMM : 브랜드 MM에 대한 할인된 가격(list price -
DiscMM)
12. SalePriceCH : 브랜드 CH에 대한 할인된 가격(list price -
DiscCH)
13. PriceDiff : 두 브랜드 간의 가격 차이(MM price - CH price)*
14. Store7 : 7번 상점에서 구매가 이뤄졌는지의 여부
15. PctDiscMM : MM의 할인율
16. PctDiscCH : CH의 할인율
17. ListPriceDiff : 두 브랜드간의 할인 전 가격 차이(MM -CH)*
18. STORE: 상점의 ID를 나타내는 변수, range : [0, 4]
summary(OJ)
## Purchase WeekofPurchase StoreID PriceCH PriceMM
## CH:653 Min. :227.0 Min. :1.00 Min. :1.690 Min. :1.690
## MM:417 1st Qu.:240.0 1st Qu.:2.00 1st Qu.:1.790 1st Qu.:1.990
## Median :257.0 Median :3.00 Median :1.860 Median :2.090
## Mean :254.4 Mean :3.96 Mean :1.867 Mean :2.085
## 3rd Qu.:268.0 3rd Qu.:7.00 3rd Qu.:1.990 3rd Qu.:2.180
## Max. :278.0 Max. :7.00 Max. :2.090 Max. :2.290
## DiscCH DiscMM SpecialCH SpecialMM
## Min. :0.00000 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.00000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :0.00000 Median :0.0000 Median :0.0000 Median :0.0000
## Mean :0.05186 Mean :0.1234 Mean :0.1477 Mean :0.1617
## 3rd Qu.:0.00000 3rd Qu.:0.2300 3rd Qu.:0.0000 3rd Qu.:0.0000
## Max. :0.50000 Max. :0.8000 Max. :1.0000 Max. :1.0000
## LoyalCH SalePriceMM SalePriceCH PriceDiff Store7
## Min. :0.000011 Min. :1.190 Min. :1.390 Min. :-0.6700 No :714
## 1st Qu.:0.325257 1st Qu.:1.690 1st Qu.:1.750 1st Qu.: 0.0000 Yes:356
## Median :0.600000 Median :2.090 Median :1.860 Median : 0.2300
## Mean :0.565782 Mean :1.962 Mean :1.816 Mean : 0.1465
## 3rd Qu.:0.850873 3rd Qu.:2.130 3rd Qu.:1.890 3rd Qu.: 0.3200
## Max. :0.999947 Max. :2.290 Max. :2.090 Max. : 0.6400
## PctDiscMM PctDiscCH ListPriceDiff STORE
## Min. :0.0000 Min. :0.00000 Min. :0.000 Min. :0.000
## 1st Qu.:0.0000 1st Qu.:0.00000 1st Qu.:0.140 1st Qu.:0.000
## Median :0.0000 Median :0.00000 Median :0.240 Median :2.000
## Mean :0.0593 Mean :0.02731 Mean :0.218 Mean :1.631
## 3rd Qu.:0.1127 3rd Qu.:0.00000 3rd Qu.:0.300 3rd Qu.:3.000
## Max. :0.4020 Max. :0.25269 Max. :0.440 Max. :4.000
CH : 653 MM : 417
<Store 7> No : 714 Yes : 356
상식적으로 생각해봤을 때, 구매에 있어서 가장 중요한 요소들은 각 제품의 가격과 경쟁 제품간의 가격 차이 그리고 브랜드에 대한 충성도라고 유추할 수 있다. 따라서, PriceCH, PriceMM, PriceDiff, LoyalCH 변수들을 중점적으로 과제를 진행한다.
# Scatter plot of list prices colored by purchased brand: same-colored points
# cluster together, suggesting brand prices help predict Purchase.
ggplot(OJ, aes(x = PriceCH, y = PriceMM, color = Purchase)) +
geom_point() +
# BUG FIX: title previously said "SalePriceCH and SalePriceMM", but this
# plot shows the list prices (PriceCH/PriceMM), not the sale prices.
labs(title = "PriceCH and PriceMM",
x = "Price_CH",
y = "Price_MM",
color = "Purchase") +
theme_minimal()
위의 scatter plot은 Price_CH와 Price_MM의 관계에 따른 Purchase의 종류를
확인한 것 이다. 점들을 보면, 같은 색상끼리 모여서 분포해 있는 것을
확인할 수 있다. 브랜드별 Price는 Purchase를 예측하는 데 유의미하다고
결론을 내린다.
# Violin plot (with an inset boxplot) of CH brand loyalty, split by the
# brand that was actually purchased.
ggplot(OJ, aes(x = Purchase, y = LoyalCH)) +
  geom_violin(trim = FALSE) +
  geom_boxplot(width = 0.1) +
  labs(
    title = "Violin Plot of LoyalCH by Purchase",
    x = "Purchase",
    y = "LoyalCH"
  ) +
  theme_minimal()
CH에 대한 브랜드 충성도가 높을수록, CH의 제품 구매로 이뤄지는 경우가
많다는 사실을 확인했다. 따라서, LoyalCH는 Purchase 예측에 유의미한
예측변수로 사용될 수 있다는 결론을 내린다.
# Scatter plot of the discounted (sale) prices, colored by purchased brand.
ggplot(OJ, aes(x = SalePriceCH, y = SalePriceMM, color = Purchase)) +
  geom_point() +
  labs(
    title = "SalePriceCH and SalePriceMM",
    x = "SalePrice_CH",
    y = "SalePrice_MM",
    color = "Purchase"
  ) +
  theme_minimal()
같은 색상의 점들끼리 대체적으로 군집을 이루고 있는 모습을 확인할 수
있다. 브랜드별 SalePrice 역시 Purchase를 예측하는 데 유의미하다고 판단을
내린다.
ggplot(OJ, aes(x = Purchase, y = PriceDiff)) + geom_boxplot()
PriceDiff = MM price - CH price
MM이 CH보다 비싸다면, 사람들은 상대적으로 저렴한 CH를 선택한다는 것을
확인할 수 있었다. 가격의 차이가 Purchase를 예측하는 데 유의미하다고
판단할 수 있다.
diagnose_outlier(OJ)
## variables outliers_cnt outliers_ratio outliers_mean with_mean
## 1 WeekofPurchase 0 0.000000 NaN 254.38130841
## 2 StoreID 0 0.000000 NaN 3.95981308
## 3 PriceCH 0 0.000000 NaN 1.86742056
## 4 PriceMM 57 5.327103 1.6900000 2.08541121
## 5 DiscCH 232 21.682243 0.2391810 0.05185981
## 6 DiscMM 41 3.831776 0.7731707 0.12336449
## 7 SpecialCH 158 14.766355 1.0000000 0.14766355
## 8 SpecialMM 173 16.168224 1.0000000 0.16168224
## 9 LoyalCH 0 0.000000 NaN 0.56578233
## 10 SalePriceMM 0 0.000000 NaN 1.96204673
## 11 SalePriceCH 75 7.009346 1.4540000 1.81556075
## 12 PriceDiff 36 3.364486 -0.5947222 0.14648598
## 13 PctDiscMM 41 3.831776 0.3625908 0.05929844
## 14 PctDiscCH 232 21.682243 0.1259733 0.02731384
## 15 ListPriceDiff 0 0.000000 NaN 0.21799065
## 16 STORE 0 0.000000 NaN 1.63084112
## without_mean
## 1 254.38130841
## 2 3.95981308
## 3 1.86742056
## 4 2.10766041
## 5 0.00000000
## 6 0.09747328
## 7 0.00000000
## 8 0.00000000
## 9 0.56578233
## 10 1.96204673
## 11 1.84281407
## 12 0.17229207
## 13 0.04721390
## 14 0.00000000
## 15 0.21799065
## 16 1.63084112
PriceMM : 57(ratio : 5%)
DiscCH : 232(ratio : 22%)
DiscMM : 41(ratio : 4%)
SalePriceCH : 75(ratio : 7%)
PriceDiff : 36(ratio : 3%)
PctDiscMM : 41(ratio : 4%)
PctDiscCH : 232(ratio : 22%)
DiscCH의 경우, 이상치의 비율이 22%가 넘는다. 높은 이상치를 갖는다는
사실을 확인할 수 있었다.
SalePriceCH의 경우도 이상치가 7%로 높은 수준을 보이지만, 이것은 DiscCH의
변수의 영향으로 보인다.
PctDiscCH 변수 역시, DiscCH에 종속적이기 때문에 22%의 높은 이상치 비율을
갖는다고 결론을 내릴 수 있다.
plot_outlier(OJ)
# Drop rows whose value in `column_name` falls outside the Tukey fences
# [Q1 - 1.5*IQR, Q3 + 1.5*IQR].
#
# Args:
#   data        : a data.frame.
#   column_name : name of a numeric column in `data`.
# Returns: `data` restricted to rows inside the fences. Rows with NA in
#   `column_name` are dropped as well — the original `data[cond, ]` form
#   silently produced garbage all-NA rows for them.
remove_outliers_IQR <- function(data, column_name) {
  values <- data[[column_name]]
  q1 <- quantile(values, 0.25, na.rm = TRUE)
  q3 <- quantile(values, 0.75, na.rm = TRUE)
  # Renamed from `IQR` to avoid shadowing base::IQR.
  iqr_val <- q3 - q1
  lower_bound <- q1 - 1.5 * iqr_val
  upper_bound <- q3 + 1.5 * iqr_val
  # Explicit NA handling: an NA comparison would otherwise index NA rows in.
  keep <- !is.na(values) & values >= lower_bound & values <= upper_bound
  data[keep, ]
}
# Remove DiscCH outliers, then re-run outlier diagnostics on the result.
removed_OJ = remove_outliers_IQR(OJ, "DiscCH")
diagnose_outlier(removed_OJ)
## variables outliers_cnt outliers_ratio outliers_mean with_mean
## 1 WeekofPurchase 0 0.0000000 NaN 251.32935561
## 2 StoreID 0 0.0000000 NaN 3.69809069
## 3 PriceCH 0 0.0000000 NaN 1.85304296
## 4 PriceMM 51 6.0859189 1.6900000 2.08161098
## 5 DiscCH 0 0.0000000 NaN 0.00000000
## 6 DiscMM 41 4.8926014 0.7731707 0.12930788
## 7 SpecialCH 62 7.3985680 1.0000000 0.07398568
## 8 SpecialMM 156 18.6157518 1.0000000 0.18615752
## 9 LoyalCH 0 0.0000000 NaN 0.54701148
## 10 SalePriceMM 0 0.0000000 NaN 1.95230310
## 11 SalePriceCH 0 0.0000000 NaN 1.85304296
## 12 PriceDiff 7 0.8353222 -0.6700000 0.09926014
## 13 PctDiscMM 41 4.8926014 0.3625908 0.06224205
## 14 PctDiscCH 0 0.0000000 NaN 0.00000000
## 15 ListPriceDiff 92 10.9785203 0.0000000 0.22856802
## 16 STORE 0 0.0000000 NaN 1.66825776
## without_mean
## 1 251.32935561
## 2 3.69809069
## 3 1.85304296
## 4 2.10698856
## 5 0.00000000
## 6 0.09618570
## 7 0.00000000
## 8 0.00000000
## 9 0.54701148
## 10 1.95230310
## 11 1.85304296
## 12 0.10574007
## 13 0.04679124
## 14 0.00000000
## 15 0.25675603
## 16 1.66825776
DiscCH 변수의 이상치를 제거함으로써 SalePriceCH와 PctDiscCH의 outlier가 완전히 제거된 것을 확인할 수 있다.
plot_normality(removed_OJ)
Q-Q plot을 통해서 확인해 본 결과, 모든 데이터들이 정규 분포를 따르지는 않았다. log transformation과 sqrt transformation을 한 이후에도 정규분포의 형태는 찾아보기 힘들었다.
correlate(OJ)
## # A tibble: 240 × 3
## var1 var2 coef_corr
## <fct> <fct> <dbl>
## 1 StoreID WeekofPurchase 0.0960
## 2 PriceCH WeekofPurchase 0.704
## 3 PriceMM WeekofPurchase 0.577
## 4 DiscCH WeekofPurchase 0.366
## 5 DiscMM WeekofPurchase 0.242
## 6 SpecialCH WeekofPurchase 0.108
## 7 SpecialMM WeekofPurchase 0.0707
## 8 LoyalCH WeekofPurchase 0.193
## 9 SalePriceMM WeekofPurchase 0.102
## 10 SalePriceCH WeekofPurchase 0.201
## # ℹ 230 more rows
# Compare how often each brand ran a special promotion.
specialCH_cnt <- sum(OJ$SpecialCH == 1)
specialMM_cnt <- sum(OJ$SpecialMM == 1)
special_cnts <- data.frame(
  variable = c("SpecialCH", "SpecialMM"),
  count = c(specialCH_cnt, specialMM_cnt)
)
# Bar chart of the two promotion counts.
ggplot(special_cnts, aes(x = variable, y = count, fill = variable)) +
  geom_bar(stat = "identity", color = "black") +
  labs(title = "Comparison of Indicator of special",
       x = "Variable", y = "Count") +
  scale_fill_manual(values = c("SpecialCH" = "blue", "SpecialMM" = "orange")) +
  theme_minimal()
두 브랜드의 할인 횟수에 대한 빈도는 큰 차이를 보이지 않았다.
# remove missing value
# Keep only fully-observed rows before the target/predictor relationship plot.
removed_OJ = removed_OJ %>% filter(complete.cases(.))
# Profile PriceCH conditioned on the target Purchase (dlookr target_by/relate).
categ = target_by(removed_OJ, Purchase)
cat_num = relate(categ, PriceCH)
plot(cat_num)
가격범위가 [1.8, 1.9] 구간에서는 CH가 우세하다는 것을 확인할 수
있었다.
nrow(OJ)
## [1] 1070
# NOTE(review): this recomputes removed_OJ from the raw OJ, silently discarding
# the complete.cases() filtering applied just above — confirm the overwrite is
# intended before relying on the "결측치와 이상치 제거" claim below.
removed_OJ = remove_outliers_IQR(OJ, "DiscCH")
cat("size of removed_OJ : ", nrow(removed_OJ))
## size of removed_OJ : 838
결측치와 이상치를 제거하고 남은 observations : 838개
#split the data into two part.(train set : test set)
set.seed(111)
# Stratified split on Purchase; p is chosen so the training set gets ~800 rows,
# which leaves only ~37 rows for testing after outlier removal (see note below).
train_index = createDataPartition(removed_OJ$Purchase, p = 800 / nrow(removed_OJ), list = FALSE)
# Create the training and test sets
train_set = removed_OJ[train_index, ]
test_set = removed_OJ[-train_index, ]
cat("size of training set:", nrow(train_set), "\n")
## size of training set: 801
cat("size of test set:", nrow(test_set), "\n")
## size of test set: 37
cat("<ratio> \n")
## <ratio>
cat("training set : test set = ",nrow(train_set) / nrow(removed_OJ) ," : ", nrow(test_set) / nrow(removed_OJ))
## training set : test set = 0.9558473 : 0.04415274
training set의 observation을 800으로 하고 나머지 observation을 test set으로 사용한 결과, [training set : test set = 0.96 : 0.04] 로 test set이 상당히 부족했다. 이것은 결측치와 이상치를 제거한 결과이다.
# Classification tree using every available predictor.
tree_model_all = rpart(Purchase ~ ., data = train_set, method="class")
# Classification tree using only the predictors hand-picked during EDA.
tree_model_selected = rpart(Purchase ~ PriceCH + PriceMM + PriceDiff + LoyalCH,
data = train_set, method = "class")
#모든 변수 사용한 tree
summary(tree_model_all)
## Call:
## rpart(formula = Purchase ~ ., data = train_set, method = "class")
## n= 801
##
## CP nsplit rel error xerror xstd
## 1 0.53801170 0 1.0000000 1.0000000 0.04093333
## 2 0.02485380 1 0.4619883 0.5000000 0.03390986
## 3 0.01461988 3 0.4122807 0.5058480 0.03405340
## 4 0.01000000 4 0.3976608 0.4853801 0.03354274
##
## Variable importance
## LoyalCH ListPriceDiff PriceDiff PriceMM SalePriceMM
## 66 9 6 4 4
## WeekofPurchase DiscMM StoreID PriceCH SalePriceCH
## 4 2 2 1 1
##
## Node number 1: 801 observations, complexity param=0.5380117
## predicted class=CH expected loss=0.4269663 P(node) =1
## class counts: 459 342
## probabilities: 0.573 0.427
## left son=2 (483 obs) right son=3 (318 obs)
## Primary splits:
## LoyalCH < 0.48285 to the right, improve=138.47760, (0 missing)
## StoreID < 3.5 to the right, improve= 26.90500, (0 missing)
## PriceDiff < 0.015 to the right, improve= 19.64897, (0 missing)
## SalePriceMM < 1.84 to the right, improve= 19.58088, (0 missing)
## DiscMM < 0.47 to the left, improve= 17.82679, (0 missing)
## Surrogate splits:
## StoreID < 3.5 to the right, agree=0.614, adj=0.028, (0 split)
## PriceMM < 1.89 to the right, agree=0.612, adj=0.022, (0 split)
## ListPriceDiff < 0.035 to the right, agree=0.610, adj=0.019, (0 split)
## WeekofPurchase < 227.5 to the right, agree=0.609, adj=0.016, (0 split)
## DiscMM < 0.57 to the left, agree=0.609, adj=0.016, (0 split)
##
## Node number 2: 483 observations, complexity param=0.0248538
## predicted class=CH expected loss=0.1884058 P(node) =0.6029963
## class counts: 392 91
## probabilities: 0.812 0.188
## left son=4 (277 obs) right son=5 (206 obs)
## Primary splits:
## LoyalCH < 0.705699 to the right, improve=16.46708, (0 missing)
## SalePriceMM < 1.84 to the right, improve=15.70711, (0 missing)
## PriceDiff < 0.015 to the right, improve=14.62609, (0 missing)
## DiscMM < 0.47 to the left, improve=12.78514, (0 missing)
## PctDiscMM < 0.227263 to the left, improve=12.78514, (0 missing)
## Surrogate splits:
## WeekofPurchase < 237.5 to the right, agree=0.631, adj=0.136, (0 split)
## PriceCH < 1.755 to the right, agree=0.625, adj=0.121, (0 split)
## SalePriceCH < 1.755 to the right, agree=0.625, adj=0.121, (0 split)
## PriceMM < 2.04 to the right, agree=0.621, adj=0.112, (0 split)
## SpecialMM < 0.5 to the left, agree=0.598, adj=0.058, (0 split)
##
## Node number 3: 318 observations
## predicted class=MM expected loss=0.2106918 P(node) =0.3970037
## class counts: 67 251
## probabilities: 0.211 0.789
##
## Node number 4: 277 observations
## predicted class=CH expected loss=0.07581227 P(node) =0.3458177
## class counts: 256 21
## probabilities: 0.924 0.076
##
## Node number 5: 206 observations, complexity param=0.0248538
## predicted class=CH expected loss=0.3398058 P(node) =0.2571785
## class counts: 136 70
## probabilities: 0.660 0.340
## left son=10 (131 obs) right son=11 (75 obs)
## Primary splits:
## ListPriceDiff < 0.235 to the right, improve=17.647740, (0 missing)
## SalePriceMM < 1.84 to the right, improve=16.049010, (0 missing)
## PriceDiff < 0.145 to the right, improve=15.304000, (0 missing)
## DiscMM < 0.15 to the left, improve= 8.961921, (0 missing)
## PctDiscMM < 0.0729725 to the left, improve= 8.961921, (0 missing)
## Surrogate splits:
## PriceDiff < 0.235 to the right, agree=0.859, adj=0.613, (0 split)
## SalePriceMM < 1.94 to the right, agree=0.777, adj=0.387, (0 split)
## PriceMM < 1.89 to the right, agree=0.748, adj=0.307, (0 split)
## WeekofPurchase < 273.5 to the left, agree=0.733, adj=0.267, (0 split)
## DiscMM < 0.47 to the left, agree=0.709, adj=0.200, (0 split)
##
## Node number 10: 131 observations
## predicted class=CH expected loss=0.1832061 P(node) =0.1635456
## class counts: 107 24
## probabilities: 0.817 0.183
##
## Node number 11: 75 observations, complexity param=0.01461988
## predicted class=MM expected loss=0.3866667 P(node) =0.09363296
## class counts: 29 46
## probabilities: 0.387 0.613
## left son=22 (23 obs) right son=23 (52 obs)
## Primary splits:
## PriceDiff < 0.015 to the right, improve=3.270658, (0 missing)
## SalePriceMM < 1.84 to the right, improve=3.150053, (0 missing)
## PctDiscMM < 0.1961965 to the left, improve=3.055152, (0 missing)
## DiscMM < 0.15 to the left, improve=2.350005, (0 missing)
## LoyalCH < 0.5976 to the right, improve=2.084961, (0 missing)
## Surrogate splits:
## SalePriceMM < 1.84 to the right, agree=0.973, adj=0.913, (0 split)
## ListPriceDiff < 0.18 to the right, agree=0.787, adj=0.304, (0 split)
## WeekofPurchase < 275.5 to the right, agree=0.733, adj=0.130, (0 split)
## PctDiscMM < 0.0980985 to the left, agree=0.733, adj=0.130, (0 split)
## DiscMM < 0.15 to the left, agree=0.707, adj=0.043, (0 split)
##
## Node number 22: 23 observations
## predicted class=CH expected loss=0.3913043 P(node) =0.02871411
## class counts: 14 9
## probabilities: 0.609 0.391
##
## Node number 23: 52 observations
## predicted class=MM expected loss=0.2884615 P(node) =0.06491885
## class counts: 15 37
## probabilities: 0.288 0.712
print(tree_model_all$variable.importance)
## LoyalCH ListPriceDiff PriceDiff PriceMM SalePriceMM
## 154.9446916 21.2559470 14.0946076 10.2987813 9.8100472
## WeekofPurchase DiscMM StoreID PriceCH SalePriceCH
## 9.5482370 5.8490724 3.9191778 1.9984316 1.9984316
## SpecialMM PctDiscMM
## 0.9592471 0.4266075
모든 변수를 사용해서 model을 만들었을 때, LoyalCH가 66으로 모델에서 가장 많이 사용되었다.
# Predicted classes for the all-variable tree on both partitions.
train_pred_all = predict(tree_model_all, train_set, type = "class")
test_pred_all = predict(tree_model_all, test_set, type = "class")
# Confusion matrices: rows = actual Purchase, columns = predicted class.
train_conf_matrix_all = table(train_set$Purchase, train_pred_all)
test_conf_matrix_all = table(test_set$Purchase, test_pred_all)
# Misclassification rate = 1 - accuracy (off-diagonal share of the table).
train_error_rate = 1 - sum(diag(train_conf_matrix_all)) / sum(train_conf_matrix_all)
test_error_rate = 1 - sum(diag(test_conf_matrix_all)) / sum(test_conf_matrix_all)
cat("train Error Rate:", train_error_rate, "\n")
## train Error Rate: 0.1697878
cat("test Error Rate:", test_error_rate, "\n")
## test Error Rate: 0.2162162
# Count leaf (terminal) nodes in the fitted rpart frame.
num_terminal_nodes = sum(tree_model_all$frame$var == "<leaf>")
cat("Number of Terminal Nodes:", num_terminal_nodes, "\n")
## Number of Terminal Nodes: 5
train Error Rate : 17%
test Error Rate : 22%
number of leaf nodes : 5
summary(tree_model_selected)
## Call:
## rpart(formula = Purchase ~ PriceCH + PriceMM + PriceDiff + LoyalCH,
## data = train_set, method = "class")
## n= 801
##
## CP nsplit rel error xerror xstd
## 1 0.53801170 0 1.0000000 1.0000000 0.04093333
## 2 0.01461988 1 0.4619883 0.4766082 0.03331668
## 3 0.01000000 5 0.3976608 0.4619883 0.03293000
##
## Variable importance
## LoyalCH PriceDiff PriceMM PriceCH
## 80 11 6 3
##
## Node number 1: 801 observations, complexity param=0.5380117
## predicted class=CH expected loss=0.4269663 P(node) =1
## class counts: 459 342
## probabilities: 0.573 0.427
## left son=2 (483 obs) right son=3 (318 obs)
## Primary splits:
## LoyalCH < 0.48285 to the right, improve=138.477600, (0 missing)
## PriceDiff < 0.015 to the right, improve= 19.648970, (0 missing)
## PriceMM < 1.89 to the right, improve= 6.509978, (0 missing)
## PriceCH < 1.925 to the left, improve= 1.893242, (0 missing)
## Surrogate splits:
## PriceMM < 1.89 to the right, agree=0.612, adj=0.022, (0 split)
## PriceDiff < -0.575 to the right, agree=0.608, adj=0.013, (0 split)
##
## Node number 2: 483 observations, complexity param=0.01461988
## predicted class=CH expected loss=0.1884058 P(node) =0.6029963
## class counts: 392 91
## probabilities: 0.812 0.188
## left son=4 (277 obs) right son=5 (206 obs)
## Primary splits:
## LoyalCH < 0.705699 to the right, improve=16.467080, (0 missing)
## PriceDiff < 0.015 to the right, improve=14.626090, (0 missing)
## PriceMM < 1.74 to the right, improve= 5.876976, (0 missing)
## PriceCH < 1.72 to the right, improve= 1.927536, (0 missing)
## Surrogate splits:
## PriceCH < 1.755 to the right, agree=0.625, adj=0.121, (0 split)
## PriceMM < 2.04 to the right, agree=0.621, adj=0.112, (0 split)
## PriceDiff < 0.015 to the right, agree=0.588, adj=0.034, (0 split)
##
## Node number 3: 318 observations
## predicted class=MM expected loss=0.2106918 P(node) =0.3970037
## class counts: 67 251
## probabilities: 0.211 0.789
##
## Node number 4: 277 observations
## predicted class=CH expected loss=0.07581227 P(node) =0.3458177
## class counts: 256 21
## probabilities: 0.924 0.076
##
## Node number 5: 206 observations, complexity param=0.01461988
## predicted class=CH expected loss=0.3398058 P(node) =0.2571785
## class counts: 136 70
## probabilities: 0.660 0.340
## left son=10 (114 obs) right son=11 (92 obs)
## Primary splits:
## PriceDiff < 0.145 to the right, improve=15.3040000, (0 missing)
## PriceMM < 2.155 to the right, improve= 4.0255650, (0 missing)
## LoyalCH < 0.6840375 to the left, improve= 2.8009220, (0 missing)
## PriceCH < 1.72 to the right, improve= 0.7231615, (0 missing)
## Surrogate splits:
## PriceMM < 2.04 to the right, agree=0.723, adj=0.380, (0 split)
## PriceCH < 1.755 to the right, agree=0.626, adj=0.163, (0 split)
## LoyalCH < 0.6955035 to the left, agree=0.568, adj=0.033, (0 split)
##
## Node number 10: 114 observations
## predicted class=CH expected loss=0.1666667 P(node) =0.1423221
## class counts: 95 19
## probabilities: 0.833 0.167
##
## Node number 11: 92 observations, complexity param=0.01461988
## predicted class=MM expected loss=0.4456522 P(node) =0.1148564
## class counts: 41 51
## probabilities: 0.446 0.554
## left son=22 (59 obs) right son=23 (33 obs)
## Primary splits:
## PriceDiff < -0.165 to the right, improve=4.2505640, (0 missing)
## PriceCH < 1.755 to the left, improve=1.4818630, (0 missing)
## LoyalCH < 0.5039495 to the right, improve=1.3921350, (0 missing)
## PriceMM < 2.11 to the left, improve=0.5869565, (0 missing)
## Surrogate splits:
## PriceCH < 1.91 to the left, agree=0.783, adj=0.394, (0 split)
## PriceMM < 2.11 to the left, agree=0.717, adj=0.212, (0 split)
## LoyalCH < 0.6111595 to the left, agree=0.652, adj=0.030, (0 split)
##
## Node number 22: 59 observations, complexity param=0.01461988
## predicted class=CH expected loss=0.440678 P(node) =0.07365793
## class counts: 33 26
## probabilities: 0.559 0.441
## left son=44 (42 obs) right son=45 (17 obs)
## Primary splits:
## LoyalCH < 0.5039495 to the right, improve=2.034326, (0 missing)
## PriceMM < 1.74 to the right, improve=1.538646, (0 missing)
## PriceDiff < -0.09 to the left, improve=1.374719, (0 missing)
## PriceCH < 1.72 to the right, improve=1.017856, (0 missing)
##
## Node number 23: 33 observations
## predicted class=MM expected loss=0.2424242 P(node) =0.0411985
## class counts: 8 25
## probabilities: 0.242 0.758
##
## Node number 44: 42 observations
## predicted class=CH expected loss=0.3571429 P(node) =0.05243446
## class counts: 27 15
## probabilities: 0.643 0.357
##
## Node number 45: 17 observations
## predicted class=MM expected loss=0.3529412 P(node) =0.02122347
## class counts: 6 11
## probabilities: 0.353 0.647
print(tree_model_selected$variable.importance)
## LoyalCH PriceDiff PriceMM PriceCH
## 157.606866 21.855978 11.610614 6.168113
변수의 일부를 선택해서 model을 생성했을 때도, LoyalCH 변수의 중요도가 높은 것을 확인할 수 있었다. 소비자들의 제품 선택에 있어서 브랜드 충성도가 가장 중요한 요소라는 것을 알 수 있었다.
# Predicted classes for the selected-variable tree on both partitions.
train_pred_selected = predict(tree_model_selected, train_set, type = "class")
test_pred_selected = predict(tree_model_selected, test_set, type = "class")
# BUG FIX: these tables previously used train_pred_all/test_pred_all, so the
# "selected" model was evaluated with the all-variable model's predictions —
# which is why both models reported byte-identical error rates.
train_conf_matrix_selected = table(train_set$Purchase, train_pred_selected)
test_conf_matrix_selected = table(test_set$Purchase, test_pred_selected)
# Misclassification rates for the selected-variable tree.
train_error_rate = 1 - sum(diag(train_conf_matrix_selected)) / sum(train_conf_matrix_selected)
# BUG FIX: this was assigned to `test_error_rate_` (trailing underscore) while
# the cat() below printed the stale `test_error_rate` from the previous model.
test_error_rate = 1 - sum(diag(test_conf_matrix_selected)) / sum(test_conf_matrix_selected)
cat("train Error Rate:", train_error_rate, "\n")
cat("test Error Rate:", test_error_rate, "\n")
# NOTE(review): the previously knitted output (0.1697878 / 0.2162162) came from
# the buggy code above; re-knit to obtain the correct selected-model rates.
# Count leaf (terminal) nodes in the selected-variable tree.
num_terminal_nodes = sum(tree_model_selected$frame$var == "<leaf>")
cat("Number of Terminal Nodes:", num_terminal_nodes, "\n")
## Number of Terminal Nodes: 6
변수를 선택해서 넣은 경우의 training error rate: 17%, test error rate: 22%로 모든 변수를 넣은 경우와 동일한 수치가 출력되었다. 다만, leaf node의 수는 변수를 선택해서 넣은 경우가 6개로 모든 변수를 사용했을 때 보다 1개 더 많은 것을 확인할 수 있었다.
(주의) 위 계산에서 confusion matrix가 train_pred_selected/test_pred_selected가
아니라 train_pred_all/test_pred_all로 만들어졌기 때문에 두 모델의 error rate이
정확히 동일하게 출력된 것이다. classification error rate 지표가 node 내 class
분포의 미세한 변화에 덜 민감하다는 점이나 test data set의 부족도 차이를 줄이는
요인이 될 수 있으나, 우선 예측값 변수를 바로잡은 뒤 두 모델을 다시 비교해야
한다. 이를 보완하기 위해 Gini-Index와 Cross-Entropy 지표도 함께 확인한다.
# Compute aggregate Gini impurity and cross-entropy of a classification
# model's predicted class probabilities on `data` (target column: Purchase).
#
# Args:
#   data  : data.frame containing the factor column Purchase.
#   model : fitted classifier supporting predict(model, data, type = "prob").
# Returns: list(Gini = ..., CrossEntropy = ...), each summed over classes.
calc_gini_entropy = function(data, model) {
  prob_matrix = predict(model, data, type = "prob")
  actual = data$Purchase
  class_names = levels(actual)
  # Per-class Gini contribution: mean of p * (1 - p) over observations.
  gini_terms = vapply(class_names, function(cls) {
    p = prob_matrix[, cls]
    mean(p * (1 - p))
  }, numeric(1))
  # Per-class cross-entropy; the small epsilon guards against log(0).
  entropy_terms = vapply(class_names, function(cls) {
    indicator = ifelse(actual == cls, 1, 0)
    -mean(indicator * log(prob_matrix[, cls] + 1e-10))
  }, numeric(1))
  list(Gini = sum(gini_terms), CrossEntropy = sum(entropy_terms))
}
# Evaluate both trees with the probability-based impurity metrics defined above.
train_results_all = calc_gini_entropy(train_set, tree_model_all)
test_results_all = calc_gini_entropy(test_set, tree_model_all)
train_results_selected = calc_gini_entropy(train_set, tree_model_selected)
test_results_selected = calc_gini_entropy(test_set, tree_model_selected)
cat("<Tree model all> \n",
"**train_set** \n",
"Training Set Gini Index:", train_results_all$Gini, "\n" ,
"Training Set Cross-Entropy:", train_results_all$CrossEntropy, "\n")
## <Tree model all>
## **train_set**
## Training Set Gini Index: 0.2697777
## Training Set Cross-Entropy: 0.4333347
cat("**test_set** \n",
"Test Set Gini Index:", test_results_all$Gini, "\n",
"Test Set Cross-Entropy:", test_results_all$CrossEntropy, "\n")
## **test_set**
## Test Set Gini Index: 0.2603624
## Test Set Cross-Entropy: 0.5127495
cat("\n <Tree model selected> \n **train_set**\n",
"Train Set Gini Index:", train_results_selected$Gini, "\n",
"Train Set Cross-Entropy:", train_results_selected$CrossEntropy, "\n")
##
## <Tree model selected>
## **train_set**
## Train Set Gini Index: 0.2689407
## Train Set Cross-Entropy: 0.4321273
cat("**test_set** \n",
"Test Set Gini Index:", test_results_selected$Gini, "\n",
"Test Set Cross-Entropy:", test_results_selected$CrossEntropy, "\n")
## **test_set**
## Test Set Gini Index: 0.2503403
## Test Set Cross-Entropy: 0.5086116
2가지 모델 전부, test set 에 대해서는 gini index와 cross entropy의 수치가 높아지는 것을 확인할 수 있다. 미세한 차이지만 predictor를 일부만 사용한 경우가 모든 predictor를 사용했을 때보다 낮은 것을 볼 수 있다.
summary(tree_model_all)
## Call:
## rpart(formula = Purchase ~ ., data = train_set, method = "class")
## n= 801
##
## CP nsplit rel error xerror xstd
## 1 0.53801170 0 1.0000000 1.0000000 0.04093333
## 2 0.02485380 1 0.4619883 0.5000000 0.03390986
## 3 0.01461988 3 0.4122807 0.5058480 0.03405340
## 4 0.01000000 4 0.3976608 0.4853801 0.03354274
##
## Variable importance
## LoyalCH ListPriceDiff PriceDiff PriceMM SalePriceMM
## 66 9 6 4 4
## WeekofPurchase DiscMM StoreID PriceCH SalePriceCH
## 4 2 2 1 1
##
## Node number 1: 801 observations, complexity param=0.5380117
## predicted class=CH expected loss=0.4269663 P(node) =1
## class counts: 459 342
## probabilities: 0.573 0.427
## left son=2 (483 obs) right son=3 (318 obs)
## Primary splits:
## LoyalCH < 0.48285 to the right, improve=138.47760, (0 missing)
## StoreID < 3.5 to the right, improve= 26.90500, (0 missing)
## PriceDiff < 0.015 to the right, improve= 19.64897, (0 missing)
## SalePriceMM < 1.84 to the right, improve= 19.58088, (0 missing)
## DiscMM < 0.47 to the left, improve= 17.82679, (0 missing)
## Surrogate splits:
## StoreID < 3.5 to the right, agree=0.614, adj=0.028, (0 split)
## PriceMM < 1.89 to the right, agree=0.612, adj=0.022, (0 split)
## ListPriceDiff < 0.035 to the right, agree=0.610, adj=0.019, (0 split)
## WeekofPurchase < 227.5 to the right, agree=0.609, adj=0.016, (0 split)
## DiscMM < 0.57 to the left, agree=0.609, adj=0.016, (0 split)
##
## Node number 2: 483 observations, complexity param=0.0248538
## predicted class=CH expected loss=0.1884058 P(node) =0.6029963
## class counts: 392 91
## probabilities: 0.812 0.188
## left son=4 (277 obs) right son=5 (206 obs)
## Primary splits:
## LoyalCH < 0.705699 to the right, improve=16.46708, (0 missing)
## SalePriceMM < 1.84 to the right, improve=15.70711, (0 missing)
## PriceDiff < 0.015 to the right, improve=14.62609, (0 missing)
## DiscMM < 0.47 to the left, improve=12.78514, (0 missing)
## PctDiscMM < 0.227263 to the left, improve=12.78514, (0 missing)
## Surrogate splits:
## WeekofPurchase < 237.5 to the right, agree=0.631, adj=0.136, (0 split)
## PriceCH < 1.755 to the right, agree=0.625, adj=0.121, (0 split)
## SalePriceCH < 1.755 to the right, agree=0.625, adj=0.121, (0 split)
## PriceMM < 2.04 to the right, agree=0.621, adj=0.112, (0 split)
## SpecialMM < 0.5 to the left, agree=0.598, adj=0.058, (0 split)
##
## Node number 3: 318 observations
## predicted class=MM expected loss=0.2106918 P(node) =0.3970037
## class counts: 67 251
## probabilities: 0.211 0.789
##
## Node number 4: 277 observations
## predicted class=CH expected loss=0.07581227 P(node) =0.3458177
## class counts: 256 21
## probabilities: 0.924 0.076
##
## Node number 5: 206 observations, complexity param=0.0248538
## predicted class=CH expected loss=0.3398058 P(node) =0.2571785
## class counts: 136 70
## probabilities: 0.660 0.340
## left son=10 (131 obs) right son=11 (75 obs)
## Primary splits:
## ListPriceDiff < 0.235 to the right, improve=17.647740, (0 missing)
## SalePriceMM < 1.84 to the right, improve=16.049010, (0 missing)
## PriceDiff < 0.145 to the right, improve=15.304000, (0 missing)
## DiscMM < 0.15 to the left, improve= 8.961921, (0 missing)
## PctDiscMM < 0.0729725 to the left, improve= 8.961921, (0 missing)
## Surrogate splits:
## PriceDiff < 0.235 to the right, agree=0.859, adj=0.613, (0 split)
## SalePriceMM < 1.94 to the right, agree=0.777, adj=0.387, (0 split)
## PriceMM < 1.89 to the right, agree=0.748, adj=0.307, (0 split)
## WeekofPurchase < 273.5 to the left, agree=0.733, adj=0.267, (0 split)
## DiscMM < 0.47 to the left, agree=0.709, adj=0.200, (0 split)
##
## Node number 10: 131 observations
## predicted class=CH expected loss=0.1832061 P(node) =0.1635456
## class counts: 107 24
## probabilities: 0.817 0.183
##
## Node number 11: 75 observations, complexity param=0.01461988
## predicted class=MM expected loss=0.3866667 P(node) =0.09363296
## class counts: 29 46
## probabilities: 0.387 0.613
## left son=22 (23 obs) right son=23 (52 obs)
## Primary splits:
## PriceDiff < 0.015 to the right, improve=3.270658, (0 missing)
## SalePriceMM < 1.84 to the right, improve=3.150053, (0 missing)
## PctDiscMM < 0.1961965 to the left, improve=3.055152, (0 missing)
## DiscMM < 0.15 to the left, improve=2.350005, (0 missing)
## LoyalCH < 0.5976 to the right, improve=2.084961, (0 missing)
## Surrogate splits:
## SalePriceMM < 1.84 to the right, agree=0.973, adj=0.913, (0 split)
## ListPriceDiff < 0.18 to the right, agree=0.787, adj=0.304, (0 split)
## WeekofPurchase < 275.5 to the right, agree=0.733, adj=0.130, (0 split)
## PctDiscMM < 0.0980985 to the left, agree=0.733, adj=0.130, (0 split)
## DiscMM < 0.15 to the left, agree=0.707, adj=0.043, (0 split)
##
## Node number 22: 23 observations
## predicted class=CH expected loss=0.3913043 P(node) =0.02871411
## class counts: 14 9
## probabilities: 0.609 0.391
##
## Node number 23: 52 observations
## predicted class=MM expected loss=0.2884615 P(node) =0.06491885
## class counts: 15 37
## probabilities: 0.288 0.712
10번노드를 선택해서 정보를 해석 해보려 한다.
Node number : 10
131 observations
predicted class=CH
expected loss=0.1832061 P(node) =0.1635456
class counts: 107 24
probabilities: 0.817 0.183
해당 노드에는 총 131개의 관측치가 존재한다.
해당 노드에서 예측된 클래스는 ‘CH’ 이다.
이 노드의 예측 손실은 18% 이다.
P(node) = 0.1635456 으로 이 노드가 전체 데이터에서 차지하는 비율을
의미한다.
‘CH’ 클래스의 확률은 81.7%, ‘MM’ 클래스의 확률은 18.3%가 된다.
# Full summary (CP table, variable importance, per-node details) of the
# tree fit on the hand-selected predictors.
summary(tree_model_selected)
## Call:
## rpart(formula = Purchase ~ PriceCH + PriceMM + PriceDiff + LoyalCH,
## data = train_set, method = "class")
## n= 801
##
## CP nsplit rel error xerror xstd
## 1 0.53801170 0 1.0000000 1.0000000 0.04093333
## 2 0.01461988 1 0.4619883 0.4766082 0.03331668
## 3 0.01000000 5 0.3976608 0.4619883 0.03293000
##
## Variable importance
## LoyalCH PriceDiff PriceMM PriceCH
## 80 11 6 3
##
## Node number 1: 801 observations, complexity param=0.5380117
## predicted class=CH expected loss=0.4269663 P(node) =1
## class counts: 459 342
## probabilities: 0.573 0.427
## left son=2 (483 obs) right son=3 (318 obs)
## Primary splits:
## LoyalCH < 0.48285 to the right, improve=138.477600, (0 missing)
## PriceDiff < 0.015 to the right, improve= 19.648970, (0 missing)
## PriceMM < 1.89 to the right, improve= 6.509978, (0 missing)
## PriceCH < 1.925 to the left, improve= 1.893242, (0 missing)
## Surrogate splits:
## PriceMM < 1.89 to the right, agree=0.612, adj=0.022, (0 split)
## PriceDiff < -0.575 to the right, agree=0.608, adj=0.013, (0 split)
##
## Node number 2: 483 observations, complexity param=0.01461988
## predicted class=CH expected loss=0.1884058 P(node) =0.6029963
## class counts: 392 91
## probabilities: 0.812 0.188
## left son=4 (277 obs) right son=5 (206 obs)
## Primary splits:
## LoyalCH < 0.705699 to the right, improve=16.467080, (0 missing)
## PriceDiff < 0.015 to the right, improve=14.626090, (0 missing)
## PriceMM < 1.74 to the right, improve= 5.876976, (0 missing)
## PriceCH < 1.72 to the right, improve= 1.927536, (0 missing)
## Surrogate splits:
## PriceCH < 1.755 to the right, agree=0.625, adj=0.121, (0 split)
## PriceMM < 2.04 to the right, agree=0.621, adj=0.112, (0 split)
## PriceDiff < 0.015 to the right, agree=0.588, adj=0.034, (0 split)
##
## Node number 3: 318 observations
## predicted class=MM expected loss=0.2106918 P(node) =0.3970037
## class counts: 67 251
## probabilities: 0.211 0.789
##
## Node number 4: 277 observations
## predicted class=CH expected loss=0.07581227 P(node) =0.3458177
## class counts: 256 21
## probabilities: 0.924 0.076
##
## Node number 5: 206 observations, complexity param=0.01461988
## predicted class=CH expected loss=0.3398058 P(node) =0.2571785
## class counts: 136 70
## probabilities: 0.660 0.340
## left son=10 (114 obs) right son=11 (92 obs)
## Primary splits:
## PriceDiff < 0.145 to the right, improve=15.3040000, (0 missing)
## PriceMM < 2.155 to the right, improve= 4.0255650, (0 missing)
## LoyalCH < 0.6840375 to the left, improve= 2.8009220, (0 missing)
## PriceCH < 1.72 to the right, improve= 0.7231615, (0 missing)
## Surrogate splits:
## PriceMM < 2.04 to the right, agree=0.723, adj=0.380, (0 split)
## PriceCH < 1.755 to the right, agree=0.626, adj=0.163, (0 split)
## LoyalCH < 0.6955035 to the left, agree=0.568, adj=0.033, (0 split)
##
## Node number 10: 114 observations
## predicted class=CH expected loss=0.1666667 P(node) =0.1423221
## class counts: 95 19
## probabilities: 0.833 0.167
##
## Node number 11: 92 observations, complexity param=0.01461988
## predicted class=MM expected loss=0.4456522 P(node) =0.1148564
## class counts: 41 51
## probabilities: 0.446 0.554
## left son=22 (59 obs) right son=23 (33 obs)
## Primary splits:
## PriceDiff < -0.165 to the right, improve=4.2505640, (0 missing)
## PriceCH < 1.755 to the left, improve=1.4818630, (0 missing)
## LoyalCH < 0.5039495 to the right, improve=1.3921350, (0 missing)
## PriceMM < 2.11 to the left, improve=0.5869565, (0 missing)
## Surrogate splits:
## PriceCH < 1.91 to the left, agree=0.783, adj=0.394, (0 split)
## PriceMM < 2.11 to the left, agree=0.717, adj=0.212, (0 split)
## LoyalCH < 0.6111595 to the left, agree=0.652, adj=0.030, (0 split)
##
## Node number 22: 59 observations, complexity param=0.01461988
## predicted class=CH expected loss=0.440678 P(node) =0.07365793
## class counts: 33 26
## probabilities: 0.559 0.441
## left son=44 (42 obs) right son=45 (17 obs)
## Primary splits:
## LoyalCH < 0.5039495 to the right, improve=2.034326, (0 missing)
## PriceMM < 1.74 to the right, improve=1.538646, (0 missing)
## PriceDiff < -0.09 to the left, improve=1.374719, (0 missing)
## PriceCH < 1.72 to the right, improve=1.017856, (0 missing)
##
## Node number 23: 33 observations
## predicted class=MM expected loss=0.2424242 P(node) =0.0411985
## class counts: 8 25
## probabilities: 0.242 0.758
##
## Node number 44: 42 observations
## predicted class=CH expected loss=0.3571429 P(node) =0.05243446
## class counts: 27 15
## probabilities: 0.643 0.357
##
## Node number 45: 17 observations
## predicted class=MM expected loss=0.3529412 P(node) =0.02122347
## class counts: 6 11
## probabilities: 0.353 0.647
Node number 4
277 observations
predicted class=CH expected
loss=0.07581227 P(node) =0.3458177
class counts: 256 21
probabilities: 0.924 0.076
해당 노드는 전체 277개의 관측치를 포함하며, 이 중 256개가 ‘CH’
클래스, 21개가 ‘MM’ 클래스이다.
이 노드는 ‘CH’ 클래스로 예측되고 있다.
이 노드의 예측 손실은 0.0758이고 약 7.58%의 관측치가 오분류되었음을
의미한다.
P(node) = 0.3458177 : 전체 데이터의 약 34.58%가 이 노드에 속하고
있다.
# Visualize the tree fit on all predictors.
rpart.plot(tree_model_all)
해당 트리에 사용된 predictors : [LoyalCH, ListPriceDiff,
PriceDiff]
root node : LoyalCH(기준 : 0.48) LoyalCH가 0.48보다 같거나 크다면,
오른쪽으로 그렇지 않다면 왼쪽으로 데이터를 분할한다. 총 5개의 leaf
nodes가 존재하고, CH로 분류하는 단말노드는 3개, MM으로 분류하는
단말노드는 2개이다.
# Visualize the tree fit on the selected predictors only.
rpart.plot(tree_model_selected)
해당 트리에 사용된 predictors : [LoyalCH, PriceDiff]
root node : LoyalCH(기준 : 0.48) 이전의 모델과 동일하게 LoyalCH가
0.48보다 같거나 크다면, 오른쪽으로 그렇지 않다면 왼쪽으로 데이터를
분할한다. 총 6개의 leaf nodes가 존재하고, CH로 분류하는 단말노드는 3개,
MM으로 분류하는 단말노드 역시 동일하게 3개이다.
# Test-set confusion matrix for the all-predictor model
# (presumably rows = actual, columns = predicted, matching table(actual, pred)
# used elsewhere in this file — verify against where it is built).
test_conf_matrix_all
## test_pred_all
## CH MM
## CH 17 4
## MM 4 12
# Test error rate of the all-predictor model: 1 - accuracy, i.e. the share
# of off-diagonal (misclassified) counts in the confusion matrix.
# Idiom fix: `<-` for assignment instead of `=`.
test_error_rate_all <- 1 - sum(diag(test_conf_matrix_all)) / sum(test_conf_matrix_all)
cat("test error rate : ", test_error_rate_all)
## test error rate : 0.2162162
# Test-set confusion matrix for the selected-predictor model.
test_conf_matrix_selected
## test_pred_all
## CH MM
## CH 17 4
## MM 4 12
# Test error rate of the selected-predictor model (same 1 - accuracy formula
# as for the all-predictor model, so the two are directly comparable).
test_error_rate_selected <- 1 - sum(diag(test_conf_matrix_selected)) / sum(test_conf_matrix_selected)
cat("test error rate : ", test_error_rate_selected)
## test error rate : 0.2162162
모든 변수를 사용해서 만든 모델과 변수를 선별해서 생성한 모델 간 유의미한 차이가 없기 때문에, 이후로는 모든 변수를 이용해서 생성한 모델에 대해서만 기술한다.
# Refit the tree (tree package) on the training data and run cost-complexity
# cross-validation, guiding pruning by misclassification count (prune.misclass).
tree_model_cv <- tree(Purchase ~ ., data = train_set)
cv_tree <- cv.tree(tree_model_cv, FUN = prune.misclass)
print(cv_tree)
## $size
## [1] 8 6 2 1
##
## $dev
## [1] 170 170 168 342
##
## $k
## [1] -Inf 0.00 7.25 184.00
##
## $method
## [1] "misclass"
##
## attr(,"class")
## [1] "prune" "tree.sequence"
# Tree size whose CV deviance (misclassification count) is smallest.
# Note: which.min() returns the FIRST minimum on ties.
optimal_size <- cv_tree$size[which.min(cv_tree$dev)]
cat("optimal_size: ", optimal_size)
## optimal_size: 2
# Convert CV misclassification counts to error rates (divide by the number of
# training rows) and plot error against tree size.
cv_data <- data.frame(size = cv_tree$size, error = cv_tree$dev / nrow(train_set))
ggplot(cv_data, aes(x = size, y = error)) +
  geom_point() +
  geom_line() +
  labs(x = "Tree Size", y = "Cross-Validated Classification Error Rate") +
  theme_minimal()
# Fit an rpart classification tree on all predictors and print the
# cost-complexity (CP) table with cross-validated error estimates.
rpart_tree_model <- rpart(Purchase ~ ., data = train_set, method = "class")
printcp(rpart_tree_model)
##
## Classification tree:
## rpart(formula = Purchase ~ ., data = train_set, method = "class")
##
## Variables actually used in tree construction:
## [1] ListPriceDiff LoyalCH PriceDiff
##
## Root node error: 342/801 = 0.42697
##
## n= 801
##
## CP nsplit rel error xerror xstd
## 1 0.538012 0 1.00000 1.00000 0.040933
## 2 0.024854 1 0.46199 0.49415 0.033764
## 3 0.014620 3 0.41228 0.47661 0.033317
## 4 0.010000 4 0.39766 0.46199 0.032930
# Plot cross-validated relative error against the CP values of the rpart model.
plotcp(rpart_tree_model)
# CV error rates by tree size computed from cv.tree above.
cv_data
## size error
## 1 8 0.2122347
## 2 6 0.2122347
## 3 2 0.2097378
## 4 1 0.4269663
tree size가 2일 때, error rate이 약 0.210으로 가장 낮았다. 또한 tree size가 1에서 2로 변화할 때, error rate이 크게 감소하고 이후에는 눈에 띄는 변화는 관찰되지 않으므로 variance를 낮추는 관점에서 size 2가 적절하다는 결론을 내린다.
# Pick the cptable row with the lowest cross-validated error (xerror):
# its CP is the optimal pruning parameter and nsplit + 1 is the tree size.
# The which.min() lookup is hoisted into best_row instead of being repeated.
best_row <- which.min(rpart_tree_model$cptable[, "xerror"])
optimal_cp <- rpart_tree_model$cptable[best_row, "CP"]
optimal_size <- rpart_tree_model$cptable[best_row, "nsplit"] + 1
cat("Optimal CP value:", optimal_cp, "\n")
cat("Optimal Tree Size:", optimal_size, "\n")
\(R_a(T)=R(T) + a|t|\) 위의 수식에서 \(a\)가 0.01 일 때, optimal이 된다.
rpart.plot(rpart_tree_model, main = "Origin Tree")
# Prune to the tree whose size (nsplit + 1) is closest to the CV-chosen target
# size of 2: find the cptable row minimizing |size - target| and use its CP.
# The magic constant is named so the target size can be changed in one place.
target_size <- 2
size_gap <- abs(rpart_tree_model$cptable[, "nsplit"] + 1 - target_size)
pruned_rpart_tree_model <- prune(rpart_tree_model, cp = rpart_tree_model$cptable[which.min(size_gap), "CP"])
# Alternative: prune at the xerror-minimizing CP instead of a target size.
# pruned_rpart_tree_model <- prune(rpart_tree_model, cp = optimal_cp)
rpart.plot(pruned_rpart_tree_model)
# Predict on the TRAINING set with both the unpruned and the pruned tree.
train_pred_unpruned <- predict(rpart_tree_model, train_set, type = "class")
train_pred_pruned <- predict(pruned_rpart_tree_model, train_set, type = "class")
# Confusion matrices: rows = actual class, columns = predicted class.
train_conf_matrix_unpruned <- table(train_set$Purchase, train_pred_unpruned)
train_conf_matrix_pruned <- table(train_set$Purchase, train_pred_pruned)
# Compute training error rates (1 - accuracy).
train_error_rate_unpruned <- 1 - sum(diag(train_conf_matrix_unpruned)) / sum(train_conf_matrix_unpruned)
train_error_rate_pruned <- 1 - sum(diag(train_conf_matrix_pruned)) / sum(train_conf_matrix_pruned)
cat("training error rates(unpruned) : ", train_error_rate_unpruned, " \n")
## training error rates(unpruned) : 0.1697878
# Training error of the pruned tree (expected to be >= the unpruned tree's).
cat("training error rates(pruned) : " , train_error_rate_pruned, " \n")
## training error rates(pruned) : 0.1972534
train set에 대해서는 pruned_model이 unpruned_model 보다 error rate이 높게 나타났다.
# Predict on the held-out TEST set with both trees.
test_pred_unpruned <- predict(rpart_tree_model, test_set, type = "class")
test_pred_pruned <- predict(pruned_rpart_tree_model, test_set, type = "class")
# Confusion matrices: rows = actual class, columns = predicted class.
test_conf_matrix_unpruned <- table(test_set$Purchase, test_pred_unpruned)
test_conf_matrix_pruned <- table(test_set$Purchase, test_pred_pruned)
# Compute TEST error rates (the original comment here said "training
# error" — that was a copy-paste mislabel; these are test-set quantities).
test_error_rate_unpruned <- 1 - sum(diag(test_conf_matrix_unpruned)) / sum(test_conf_matrix_unpruned)
test_error_rate_pruned <- 1 - sum(diag(test_conf_matrix_pruned)) / sum(test_conf_matrix_pruned)
test_conf_matrix_unpruned
## test_pred_unpruned
## CH MM
## CH 17 4
## MM 4 12
# Test-set confusion matrix for the pruned tree.
test_conf_matrix_pruned
## test_pred_pruned
## CH MM
## CH 17 4
## MM 5 11
# Test error of the unpruned tree.
cat("test error rates(unpruned) : ", test_error_rate_unpruned, " \n")
## test error rates(unpruned) : 0.2162162
# Test error of the pruned tree.
cat("test error rates(pruned) : " , test_error_rate_pruned, " \n")
## test error rates(pruned) : 0.2432432
test set에 대해서도 역시 pruned model이 높은 error rate을 보였다. test data가 충분하지 않아서 발생한 문제라고 추측한다. confusion matrix를 통해 정성적으로 파악해보면, 유의미한 차이를 찾을 수 없었다.
# Split the Caravan data into a 1,000-row training set and the remaining
# rows as the test set, then drop near-zero-variance predictors.
data("Caravan")
Caravan <- na.omit(Caravan)
dim(Caravan)
## [1] 5822 86
set.seed(123)
# seq_len() instead of 1:nrow() (safe if nrow were ever 0); sample of the
# full index vector is a random permutation of the row indices.
indices <- sample(seq_len(nrow(Caravan)))
train_indices <- indices[1:1000]
# Negative indexing takes "everything after the first 1000" without relying
# on length(indices) > 1000 producing a valid ascending range.
test_indices <- indices[-(1:1000)]
trainSet <- Caravan[train_indices, ]
testSet <- Caravan[test_indices, ]
# Recode the response as a 0/1 factor for gbm via caret.
trainSet$Purchase <- as.factor(ifelse(trainSet$Purchase == "Yes", 1, 0))
testSet$Purchase <- as.factor(ifelse(testSet$Purchase == "Yes", 1, 0))
# Identify near-zero-variance predictors on the TRAINING set only, then
# remove them from both sets so the test set stays untouched by selection.
nzv <- nearZeroVar(trainSet[, -which(names(trainSet) == "Purchase")], saveMetrics = TRUE)
nzv_vars <- rownames(nzv[nzv$nzv, ])
trainSet <- trainSet[, !(names(trainSet) %in% nzv_vars)]
testSet <- testSet[, !(names(testSet) %in% nzv_vars)]
# Train a gradient boosting model (gbm) via caret with 5-fold CV.
# The tuning grid holds every hyperparameter at a single value, so CV here
# only estimates performance rather than selecting among candidates.
train_control <- trainControl(method = "cv", number = 5)
boost.model <- train(Purchase ~ .,
                     data = trainSet,
                     method = "gbm",
                     trControl = train_control,
                     verbose = FALSE,
                     tuneGrid = expand.grid(n.trees = 1000,
                                            interaction.depth = 1,
                                            shrinkage = 0.01,
                                            n.minobsinnode = 10))
print(boost.model)
## Stochastic Gradient Boosting
##
## 1000 samples
## 51 predictor
## 2 classes: '0', '1'
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 799, 800, 800, 801, 800
## Resampling results:
##
## Accuracy Kappa
## 0.9330043 -0.001874549
##
## Tuning parameter 'n.trees' was held constant at a value of 1000
##
## Tuning parameter 'shrinkage' was held constant at a value of 0.01
##
## Tuning parameter 'n.minobsinnode' was held constant at a value of 10
# Unscaled (scale = FALSE keeps gbm's raw relative-influence values)
# variable importance of the boosted model.
importance <- varImp(boost.model, scale = FALSE)
print(importance)
## gbm variable importance
##
## only 20 most important variables shown (out of 51)
##
## Overall
## PPERSAUT 68.350
## MOSTYPE 42.013
## MINK4575 30.811
## APERSAUT 21.938
## MGODPR 20.061
## MBERMIDD 18.859
## PBRAND 18.750
## MOSHOOFD 15.795
## MOPLLAAG 11.391
## MINK3045 10.814
## MINK7512 10.052
## MINKGEM 9.807
## MZFONDS 8.662
## MGODOV 8.558
## MSKB2 8.478
## MGODGE 8.384
## MINKM30 8.031
## MSKB1 7.899
## MFWEKIND 7.430
## MBERZELF 6.748
Caravan data set에서 Purchase 변수를 예측하는 데에 있어서 PPERSAUT 변수가 가장 중요한 것으로 나타난다.